from pyspark.sql import SparkSession

def _copy_dataset(spark: "SparkSession", fmt: str, source: str, target: str) -> None:
    """Read *source* as *fmt*, print a short summary, and rewrite it to *target*.

    The summary is the row count followed by the first five rows. The data
    is then written back in the same format, overwriting anything already
    at *target*.

    Args:
        spark: Active Spark session used for the read.
        fmt: Spark data source name passed to both reader and writer
            (this script uses "parquet" and "csv").
        source: Path of the input file.
        target: Path of the output directory.
    """
    # NOTE(review): no reader/writer options are set, so for "csv" a header
    # line (if the input has one) is treated as an ordinary data row and is
    # included in the count -- confirm whether header=true is wanted.
    frame = spark.read.format(fmt).load(source)

    print(frame.count())
    frame.show(5)

    # coalesce(1) collapses the data to a single partition so that the
    # output is saved as one file.
    frame.coalesce(1).write.format(fmt).mode("overwrite").save(target)


def main() -> None:
    """Round-trip the green-taxi sample data through parquet and CSV."""
    spark = (
        SparkSession.builder
        .master("local[3]")
        .appName("sample2")
        .getOrCreate()
    )

    try:
        # Forward slashes work on every platform; backslashes would be taken
        # as literal filename characters on POSIX systems.
        _copy_dataset(
            spark, "parquet",
            "data/green_tripdata_2025-01.parquet", "data/parquet/",
        )
        _copy_dataset(
            spark, "csv",
            "data/green_tripdata_2025-01.csv", "data/csv/",
        )
    finally:
        # Always release the session, even when a read or write fails.
        spark.stop()


if __name__ == "__main__":
    main()
